#!/usr/bin/env python
# coding: utf-8

# In[1]:


import numpy as np
import random
import time
import matplotlib.pyplot as plt
import pickle
import train
from train import sarsa, PCVaR_Q_Pre_train, CVaR_Q_Pre_train, CVaR_Q_learning, PCVaR_Q_learning 

# --- Grid-world layout: cells are (first, second) coordinate pairs -----------
GRID_HEIGHT, GRID_WIDTH = 8, 10  # exclusive upper bounds checked by is_valid
START, GOAL = (7, 0), (7, 9)
OBSTACLES = [(5, 2), (5, 5), (2, 4), (6, 7)]  # entering one ends the episode in step

# Moves as offsets added component-wise to the current state in step.
ACTIONS = [(-1, 0), (1, 0), (0, -1), (0, 1)]
NUM_ACTIONS = len(ACTIONS)

# 251 evenly spaced values from -150 to 100 (unit spacing, so H[150] == 0).
H = np.linspace(-150, 100, num=251)

# NOTE(review): gamma and threshold are not read by the code visible in this
# script; presumably a discount factor and a tolerance used elsewhere - confirm.
gamma, threshold = 0.99, 1e-4

# Rewards: per-step default, and the means of the noisy terminal rewards.
reward_default, reward_goal, reward_obstacle = -1, 50, -50

# Probability that step replaces the requested action with a different one.
random_probability = 0.3

# Quantile level passed to np.quantile / np.percentile for the VaR/CVaR values.
q = 0.1

SEED = 0

def set_seed(seed):
    """Seed Python's `random` module and NumPy's global RNG with `seed`.

    Both generators are used by this script, so both must be seeded for a
    run to be reproducible.
    """
    for seed_rng in (random.seed, np.random.seed):
        seed_rng(seed)
    
def is_valid(pos):
    """Return whether the two-element position `pos` lies inside the grid.

    The first coordinate must be in [0, GRID_HEIGHT) and the second in
    [0, GRID_WIDTH).
    """
    first, second = pos
    first_ok = 0 <= first < GRID_HEIGHT
    second_ok = 0 <= second < GRID_WIDTH
    return first_ok and second_ok

def step(state, action_index):
    """Apply one (possibly perturbed) action and return the transition.

    With probability `random_probability` the requested action is replaced by
    one of the other actions, chosen uniformly at random.

    Args:
        state: Current position; its first two elements are the coordinates.
        action_index: Index into ACTIONS of the requested move.

    Returns:
        A tuple `(next_state, reward, done)`:
        - a move that would leave the grid keeps the agent in `state`, costs
          `reward_default` and does not end the episode;
        - entering an obstacle or the goal ends the episode with a reward
          drawn from a normal distribution (standard deviation 1) centred on
          `reward_obstacle` or `reward_goal` respectively;
        - any other move costs `reward_default`.
    """
    slipped = random.random() < random_probability
    if slipped:
        alternatives = list(range(NUM_ACTIONS))
        alternatives.remove(action_index)
        action_index = random.choice(alternatives)

    delta_first, delta_second = ACTIONS[action_index]
    candidate = (state[0] + delta_first, state[1] + delta_second)

    # Bumping into the border: stay put, pay the ordinary step cost.
    if not is_valid(candidate):
        return state, reward_default, False

    # Terminal cells carry a noisy reward.
    if candidate in OBSTACLES:
        return candidate, np.random.normal(reward_obstacle, 1), True
    if candidate == GOAL:
        return candidate, np.random.normal(reward_goal, 1), True

    return candidate, reward_default, False
  
def choose_action_PCVaR(Q_cvar, M, state, idx, epsilon):
    """Pick an action epsilon-greedily from the PCVaR score table.

    Args:
        Q_cvar: Array indexed as [state[0], state[1], idx, action].
        M: Array indexed the same way as `Q_cvar`.
        state: Current position; its first two elements index the arrays.
        idx: Index into H (and into the third axis of `Q_cvar` / `M`).
        epsilon: Probability of returning a uniformly random action.

    Returns:
        An action index. In the greedy branch the score of each action is
        `Q_cvar[...] - H[idx] * M[...]`, and ties for the maximum are broken
        uniformly at random.
    """
    explore = np.random.rand() < epsilon
    if explore:
        return random.randint(0, NUM_ACTIONS - 1)

    first, second = state[0], state[1]
    scores = Q_cvar[first, second, idx, :] - H[idx] * M[first, second, idx, :]
    best_actions = np.where(scores == np.max(scores))[0]
    return np.random.choice(best_actions)
      
def get_mean_and_std_from_list(data_list):
    """Stack sequences of possibly different lengths and summarise them.

    Every sequence is cut to the length of the shortest one before stacking.

    Args:
        data_list: Non-empty list of sequences (one per run).

    Returns:
        A tuple `(stacked, mean, std)` where `stacked` is the array of
        truncated sequences and `mean` / `std` are taken across sequences
        (axis 0).
    """
    shortest = min(map(len, data_list))
    stacked = np.array([run[:shortest] for run in data_list])
    return stacked, stacked.mean(axis=0), stacked.std(axis=0)

# Run SARSA from a fixed seed; its first return value (Q_sarsa) is passed to
# both pre-training calls below.
set_seed(SEED)
Q_sarsa, rewards_sarsa = sarsa()

# Pre-train the PCVaR-Q quantities (Q_cvar and M) from Q_sarsa.
# NOTE(review): the second argument (50000) is presumably an episode or sample
# count - confirm against train.PCVaR_Q_Pre_train.
set_seed(SEED)
Pre_PCVaR_Q_cvar, Pre_M, Pre_rewards = PCVaR_Q_Pre_train(Q_sarsa, 50000)
eta_RN = np.quantile(Pre_rewards, q)

# Pre-train the CVaR-Q baseline from the same seed and the same Q_sarsa.
# NOTE(review): this call rebinds Pre_rewards (and the next line rebinds
# eta_RN), so the rewards returned by the PCVaR pre-training above are
# discarded; code after this point sees only the CVaR pre-training rewards.
# eta_RN is not read again in the visible part of this script - confirm both
# are intended.
set_seed(SEED)
Pre_CVaR_Q_cvar, Pre_rewards = CVaR_Q_Pre_train(Q_sarsa, 50000)
eta_RN = np.quantile(Pre_rewards, q)

# Per-run results of PCVaR-Q learning: final Q_cvar, final M, and CVaR history.
PCVaR_Q_list = []
M_list = []
PCVaR_Q_cvar_hist_list = []

# Per-run results of the CVaR-Q baseline: final Q_cvar and CVaR history.
CVaR_Q_list = []
CVaR_Q_cvar_hist_list = []

# NOTE(review): num_episodes is not passed to either learner below and is not
# read elsewhere in the visible part of this script - confirm whether the
# learners were meant to receive it.
num_episodes = 15000

# Train both algorithms once per seed (0..4). A dedicated lowercase loop
# variable is used so the module-level SEED constant is not rebound.
for run_seed in range(5):
    set_seed(run_seed)

    # Copies keep the pre-trained arrays untouched, so every run starts from
    # the same initial values even if the learner updates its inputs in place.
    Q_cvar, M, cvar_hist = PCVaR_Q_learning(
        np.copy(Pre_PCVaR_Q_cvar),
        np.copy(Pre_M)
    )

    PCVaR_Q_list.append(Q_cvar)
    M_list.append(M)
    PCVaR_Q_cvar_hist_list.append(cvar_hist)

    # Re-seed so the baseline sees the same random stream as PCVaR-Q did.
    set_seed(run_seed)

    Q_cvar, cvar_hist = CVaR_Q_learning(
        np.copy(Pre_CVaR_Q_cvar)
    )

    CVaR_Q_list.append(Q_cvar)
    CVaR_Q_cvar_hist_list.append(cvar_hist)
    
# Persist the per-run training results, one pickle file per list, written to
# the current working directory.
_artifacts = (
    ('PCVaR_Q_list.pickle', PCVaR_Q_list),
    ('M_list.pickle', M_list),
    ('PCVaR_Q_cvar_hist_list.pickle', PCVaR_Q_cvar_hist_list),
    ('CVaR_Q_list.pickle', CVaR_Q_list),
    ('CVaR_Q_cvar_hist_list.pickle', CVaR_Q_cvar_hist_list),
)
for _path, _payload in _artifacts:
    with open(_path, 'wb') as f:
        pickle.dump(_payload, f)
    
# Roll out the greedy PCVaR-Q policy of the first training run (index 0) and
# collect the undiscounted total reward of each episode.
set_seed(0)
rewards_cvar = []
eta = 9
for _ in range(50000):  # `_` instead of `iter`, which shadowed the builtin
    state = START
    done = False
    # H = linspace(-150, 100, 251) has unit spacing with H[150] == 0, so the
    # index of the value `eta` in H is eta + 150.
    eta_t_idx = int(eta) + 150
    total_reward = 0
    t = 0
    while not done:
        # epsilon = 0.0: always take a greedy action.
        action = choose_action_PCVaR(PCVaR_Q_list[0], M_list[0], state, eta_t_idx, 0.0)
        next_state, reward, done = step(state, action)
        total_reward += reward
        # Shift the H index by the rounded reward, staying inside H's range.
        eta_t_idx = np.clip(eta_t_idx - int(round(reward)), 0, len(H) - 1)
        state = next_state
        t += 1
        # Safety cap: cut off episodes that have not terminated by themselves.
        if t > 1000:
            done = True
    rewards_cvar.append(total_reward)

rewards_cvar = np.array(rewards_cvar)
# np.asarray makes the boolean-mask indexing below work even when the
# pre-training function returns a plain list; an ndarray is passed through
# unchanged.
rewards_RN = np.asarray(Pre_rewards)

# Empirical VaR (q-quantile) and CVaR (mean of the returns at or below the
# VaR) of each return sample.
var_RN = np.percentile(rewards_RN, q * 100)
cvar_RN = np.mean(rewards_RN[rewards_RN <= var_RN])
var_cvar = np.percentile(rewards_cvar, q * 100)
cvar_cvar = np.mean(rewards_cvar[rewards_cvar <= var_cvar])

# Figure 3 (a): overlaid density histograms of the two return samples, with a
# dashed vertical line at each sample's CVaR.
plt.figure(figsize=(12, 6))

_hist_style = dict(bins=30, edgecolor='black', density=True)
plt.hist(rewards_RN, alpha=0.4, label='RN', color='royalblue', **_hist_style)
plt.hist(rewards_cvar, alpha=0.6, label='PCVaR-Q', color='darkorange', **_hist_style)

_cvar_line_style = dict(linestyle='--', linewidth=2)
plt.axvline(cvar_RN, color='blue', label='CVaR of RN ', **_cvar_line_style)
plt.axvline(cvar_cvar, color='red', label='CVaR of PCVaR-Q', **_cvar_line_style)

plt.xlabel('Reward')
plt.ylabel('Density')
plt.legend(fontsize=16)
plt.grid(True, linestyle='--', alpha=0.5)
plt.tight_layout()
plt.savefig("Figure 3 (a).png", dpi=300)
plt.close()

# Figure 3 (b): estimated-CVaR learning curves of PCVaR-Q and CVaR-Q - one
# faint dashed trace per training run, plus the mean and a +/- 1 std band.
seeds = [0, 1, 2, 3, 4]  # seeds of the training runs (the training loop uses range(5))
cvar_runs, cvar_mean, cvar_std = get_mean_and_std_from_list(CVaR_Q_cvar_hist_list)
pcvar_runs, pcvar_mean, pcvar_std = get_mean_and_std_from_list(PCVaR_Q_cvar_hist_list)

# The two algorithms' histories are truncated independently above, so they may
# end up with different lengths. Plot only the prefix both share; otherwise
# matplotlib rejects x/y arrays of unequal size.
n_points = min(len(cvar_mean), len(pcvar_mean))
cvar_runs, cvar_mean, cvar_std = (
    cvar_runs[:, :n_points], cvar_mean[:n_points], cvar_std[:n_points]
)
pcvar_runs, pcvar_mean, pcvar_std = (
    pcvar_runs[:, :n_points], pcvar_mean[:n_points], pcvar_std[:n_points]
)

# x = 1000, 2000, ..., 1000 * n_points.
# NOTE(review): presumably one history entry is recorded every 1000 training
# iterations - confirm against the learners in train.
x = np.arange(1000, 1000 * (n_points + 1), 1000)

plt.figure(figsize=(12, 6))
# zip stops at the shortest input, so a run count that differs from
# len(seeds) cannot cause an IndexError.
for _, pcvar_run, cvar_run in zip(seeds, pcvar_runs, cvar_runs):
    plt.plot(x, pcvar_run, linestyle='--', color='darkorange', alpha=0.3)
    plt.plot(x, cvar_run, linestyle='--', color='royalblue', alpha=0.3)
plt.plot(x, pcvar_mean, color='darkorange', marker='o', linewidth=2.5, label='PCVaR-Q')
plt.fill_between(x, pcvar_mean - pcvar_std, pcvar_mean + pcvar_std,
                 color='darkorange', alpha=0.2)

plt.plot(x, cvar_mean, color='royalblue', marker='s', linewidth=2.5, label='CVaR-Q')
plt.fill_between(x, cvar_mean - cvar_std, cvar_mean + cvar_std,
                 color='royalblue', alpha=0.2)
# NOTE(review): -52.34 is a hard-coded reference level labelled 'Opt'; its
# derivation is not visible in this script.
plt.axhline(y=-52.34, color='red', linestyle='--', linewidth=2, label='Opt')
plt.xlabel("Training Iteration", fontsize=12)
plt.ylabel("Estimated CVaR", fontsize=12)
plt.grid(True, linestyle='--', alpha=0.5)
plt.legend(fontsize=14, loc='lower right')
plt.tight_layout()
plt.savefig("Figure 3 (b).png", dpi=300)
plt.close()

